{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.retrievers import ParentDocumentRetriever\n",
    "\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain.document_loaders import PyPDFLoader\n",
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.storage import InMemoryStore\n",
    "\n",
    "# Load pdf\n",
    "loader = PyPDFLoader(\"E:\\\\langchain_RAG\\\\data\\\\baichuan.pdf\")\n",
    "data = loader.load()\n",
    "\n",
    "# Split\n",
    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)\n",
    "splits = text_splitter.split_documents(data[:6])\n",
    "\n",
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "OPENAI_API_KEY = getpass()\n",
    "\n",
    "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 检索完整的文档"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This text splitter is used to create the child documents\n",
    "child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)\n",
    "# The vectorstore to use to index the child chunks\n",
    "vectorstore = Chroma(\n",
    "    collection_name=\"full_documents\", embedding_function=OpenAIEmbeddings()\n",
    ")\n",
    "# The storage layer for the parent documents\n",
    "store = InMemoryStore()\n",
    "retriever = ParentDocumentRetriever(\n",
    "    vectorstore=vectorstore,\n",
    "    docstore=store,\n",
    "    child_splitter=child_splitter,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "retriever.add_documents(data[:6], ids=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(list(store.yield_keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_docs = vectorstore.similarity_search(\"What is baichuan2 ？\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_docs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(sub_docs[0].page_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "retrieved_docs = retriever.get_relevant_documents(\"What is baichuan2 ？\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(retrieved_docs[0].page_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(retrieved_docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "####  检索较大的文本块"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This text splitter is used to create the parent documents\n",
    "parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)\n",
    "# This text splitter is used to create the child documents\n",
    "# It should create documents smaller than the parent\n",
    "child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)\n",
    "# The vectorstore to use to index the child chunks\n",
    "vectorstore = Chroma(\n",
    "    collection_name=\"split_parents\", embedding_function=OpenAIEmbeddings()\n",
    ")\n",
    "# The storage layer for the parent documents\n",
    "store = InMemoryStore()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "retriever = ParentDocumentRetriever(\n",
    "    vectorstore=vectorstore,\n",
    "    docstore=store,\n",
    "    child_splitter=child_splitter,\n",
    "    parent_splitter=parent_splitter,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "retriever.add_documents(data[:6], ids=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(list(store.yield_keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_docs = vectorstore.similarity_search(\"What is baichuan2 ？\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(sub_docs[0].page_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "retrieved_docs = retriever.get_relevant_documents(\"What is baichuan2 ？\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(retrieved_docs[0].page_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "langchain",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
